Article searching and Restaurant Matching

實作簡易的文章搜尋系統,以及標題餐廳推薦

第一部分 文章搜尋系統

引入Json檔,並且取出所需的資料欄位(類別,URL,名稱)


In [1]:
# --- Notebook cell: load the article dump into a Spark RDD ---
# `sc` is the SparkContext supplied by the notebook driver environment.
import copy
from pyspark.sql import SQLContext
import json

sqlContext = SQLContext(sc)
df = sqlContext.jsonFile("./spark_tutorial_article.json")

# Keep only the three fields used downstream: presumably
# (category, HTML body, URL) judging by later use — TODO confirm against the JSON schema.
gf = df.map(lambda x : (x[2],x[5],x[12]))

print type(gf)
#spark.read.json(sc.wholeTextFiles('./spark_tutorial_article.json').values())


<class 'pyspark.rdd.PipelinedRDD'>

In [2]:
#sc.textFile("./spark_tutorial_article.json").map(json.loads).take(1)[0][u'author']

用BeautifulSoup擷取內容,並套用Jieba斷詞


In [3]:
## getContent: for an input article, get its own word set via jieba.cut()
def getContent(x):
    """Strip HTML from an article body and segment it into CJK terms.

    BeautifulSoup removes the markup, whitespace characters are dropped,
    then jieba segments the text; only multi-character, pure-CJK terms
    (per checkword) are kept.
    """
    from bs4 import BeautifulSoup
    import jieba
    text = BeautifulSoup(x).getText()
    for ws in ('\n', '\r', ' ', '\t'):
        text = text.replace(ws, '')
    return [term for term in jieba.cut(text) if len(term) > 1 and checkword(term)]

def checkword(x):
    """Return True iff every character of x is a CJK Unified Ideograph.

    An empty string vacuously passes, matching the original all() semantics.
    """
    for ch in x:
        if ch < u'\u4e00' or ch > u'\u9fff':
            return False
    return True

In [4]:
# Tokenize each article body: RDD of (category, token_list, url).
text_token = gf.map(lambda x: (x[0], getContent(x[1]), x[2]))

In [150]:
#text_token.first()
#text_token.first()
#text_token.count()

計算每篇文章的TF-IDF Vector


In [6]:
def cal_tf(tokens):
    """Term frequency for one document.

    Returns {token: count / len(tokens)}; an empty token list yields {}.
    """
    counts = {}
    for tok in tokens:
        counts[tok] = counts.get(tok, 0) + 1
    total = len(tokens)
    return dict((tok, float(cnt) / total) for tok, cnt in counts.items())

# Per-document term-frequency dicts (RDD of {token: tf}).
text_token_tf = text_token.map(lambda x: cal_tf(x[1]))

In [7]:
#check text_token_tf
#text_token_tf.first()

In [8]:
def cal_idf(docs):
    """Raw inverse document frequency (N / doc-frequency) per token.

    `docs` is an RDD of (category, tokens, url) records; returns an RDD
    of (token, idf) pairs.
    """
    total_docs = docs.count()
    distinct_per_doc = docs.map(lambda rec: list(set(rec[1])))
    doc_freq = (distinct_per_doc
                .flatMap(lambda toks: toks)
                .map(lambda tok: (tok, 1))
                .reduceByKey(lambda a, b: a + b))
    return doc_freq.map(lambda pair: (pair[0], float(total_docs) / pair[1]))

In [9]:
def TFIDF(tokens, idfs):
    """TF-IDF weights for one document's token list.

    Every token must have an entry in `idfs` (raises KeyError otherwise,
    as the original did).
    """
    tfs = cal_tf(tokens)
    return dict((tk, tf * idfs[tk]) for tk, tf in tfs.items())

In [10]:
# Corpus-wide IDF values, collected to the driver as a plain dict.
doc_idfs = cal_idf(text_token)

doc_c = doc_idfs.collectAsMap()  #my idf dict

# TF-IDF vector of the first document (its token list is element [1]).
text_tfidf = TFIDF(text_token.collect()[0][1], doc_c)

# Print the first document's category label.
print text_token.collect()[0][0]


美味食記

In [11]:
#check text_tfidf
#text_tfidf
#text_token.collect()[0][1]

計算Cosine similarity


In [12]:
import math

def dotprod(a, b):
    """Dot product of two sparse vectors represented as {token: weight} dicts.

    Only keys present in both dicts contribute; disjoint dicts give 0.
    """
    return sum(a[key] * b[key] for key in a if key in b)

def norm(a):
    """Euclidean (L2) norm of a sparse dict vector."""
    squared = dotprod(a, a)
    return math.sqrt(squared)

def cossim(a, b):
    """Cosine similarity between two sparse dict vectors.

    Raises ZeroDivisionError if either vector is all-zero, as the
    original did.
    """
    denominator = norm(a) * norm(b)
    return dotprod(a, b) / denominator

In [13]:
def cosineSimilarity(string1, string2, idfsDictionary):
    """Cosine similarity of two token lists under a shared IDF dictionary.

    Fix: the original called `tfidf`, which is not defined anywhere in the
    notebook — the helper defined above is `TFIDF(tokens, idfs)`, so this
    function raised NameError whenever it was called.
    """
    w1 = TFIDF(string1, idfsDictionary)
    w2 = TFIDF(string2, idfsDictionary)
    return cossim(w1, w2)

Rule One - top words in a text


In [14]:
def showTopWord(link):
    # Show the highest-TF-IDF terms of the article whose URL (field [2])
    # equals `link`; returns the top-14 terms.  Relies on the globals
    # `text_token` (tokenized corpus RDD) and `doc_c` (collected IDF dict).
    tokens = text_token.filter(lambda x: x[2] == link).collect()[0][1]
    tokens_weights = TFIDF(tokens, doc_c)
    print type(tokens_weights)
    # Tokens sorted by descending TF-IDF weight.
    tokens_weights_sorted = sorted(tokens_weights, key=tokens_weights.get, reverse=True)
    # NOTE(review): range(0,9) prints only 9 terms (not 10), and raises
    # IndexError when the article has fewer than 9 distinct tokens.
    for index in range(0,9):
        print tokens_weights_sorted[index], tokens_weights[tokens_weights_sorted[index]]
    print tokens_weights_sorted[:14]
    return tokens_weights_sorted[:14]

In [15]:
# Sample article URL for trying showTopWord.
link = u'http://lovecc6.pixnet.net/blog/post/73513867'
#showTopWord(link)

In [16]:
# RDD of article URLs only.
urls = text_token.map(lambda x : x[2])


#top_word_list = text_token.map(lambda x : showTopWord(x[2]))
#top_word_list = [showTopWord(i) for i in urls]
#top_word_list = urls.map(lambda x: showTopWord(x))
#top_word_list

In [17]:
#top_word_list

Rule Two - Query in text


In [18]:
# Example query terms (shrimp balls / spicy / Thai-style).
query_input = [u'蝦球', u'辣味', u'泰式']

def check_in(query, text):
    """Count how many of the query terms occur in `text` (a token list)."""
    return sum(1 for term in query if term in text)

def query_points(query):
    """Map the corpus to per-document query hit counts (RDD of ints)."""
    return text_token.map(lambda doc: check_in(query, doc[1]))

In [19]:
# Per-document query hit counts, collected to the driver.
query_pts = query_points(query_input).collect()

# Number of documents scored.
len(query_pts)


Out[19]:
2228

Rule 3 - Term Weights


In [20]:
def term_weights(tokens):
    """Raw term counts for one document: {token: occurrences}."""
    counts = {}
    for tok in tokens:
        counts[tok] = counts.get(tok, 0) + 1
    return counts

In [21]:
def term_points(query, point_dict):
    """Sum the stored counts of every query term present in `point_dict`."""
    return sum(point_dict[term] for term in query if term in point_dict)

# Raw term-count dict for every document, collected to the driver.
tf_list = text_token.map(lambda x : term_weights(x[1])).collect()

In [22]:
# Weighted query score for every document.
term_pts = [term_points(query_input, i) for i in tf_list]
len(term_pts)


Out[22]:
2228

計算文章分數


In [23]:
def doc_points(term_weight_pts, query_pts):
    """Combine the two per-document score lists element-wise.

    Each document's final score is (term-weight score) * (query hit count).
    Removed the commented-out legacy implementation that shadowed this logic.
    """
    return [tw * qp for tw, qp in zip(term_weight_pts, query_pts)]

In [24]:
# Pair each document's combined score with its URL.
url_list = text_token.map(lambda x : (x[2]))

total_pts = zip(doc_points(term_pts, query_pts) , url_list.collect())

In [25]:
#print type(total_pts)
# Rank documents by descending score (ties broken by URL, descending).
total_pts_sort = sorted(total_pts, reverse=True)
#total_pts_sort

In [26]:
# Show the ten best-scoring (score, url) pairs.
total_pts_sort[:10]


Out[26]:
[(60, u'http://changfong.pixnet.net/blog/post/40749658'),
 (34, u'http://evisko.pixnet.net/blog/post/258052708'),
 (29, u'http://justnike.pixnet.net/blog/post/61919500'),
 (28, u'http://wonderfood.pixnet.net/blog/post/198089649'),
 (28, u'http://lemonadellen.pixnet.net/blog/post/32114431'),
 (25, u'http://changfong.pixnet.net/blog/post/41828851'),
 (24,
  u'http://sedo888.pixnet.net/blog/post/341765034-%5b%e5%8f%b0%e5%8c%97%5d-%e5%96%9c%e4%be%86%e7%99%bb%e5%a4%a7%e9%a3%af%e5%ba%97-%e2%80%a7-sukhothai%e8%98%87%e5%8f%af%e6%b3%b0%e6%b3%b0%e5%bc%8f%e6%96%99'),
 (24, u'http://lemonadellen.pixnet.net/blog/post/40637716'),
 (20, u'http://protozoa.pixnet.net/blog/post/29765279'),
 (20, u'http://infinite520visa.pixnet.net/blog/post/148834846')]

第二部分 - 實作餐廳的 matching

用函式尋找完全配對,回傳分數


In [27]:
def exact_match(restaurant, title):
    """Score 1 if `restaurant` occurs verbatim in `title`, else 0.

    Names shorter than 3 characters are ignored (score 0) to avoid
    spurious substring matches.

    Fixes: removed the dead locals `exact_match_flag` (assigned, never
    read) and `exact_match_pts` (assigned, never used) and the stray
    semicolon.
    """
    if len(restaurant) < 3:
        return 0
    return 1 if restaurant in title else 0

將標題斷詞,剔除不需要字元


In [176]:
def title_checkword(x):
    """True iff every character is a CJK ideograph or an ASCII letter/digit.

    The empty string vacuously passes, matching the original all() form.
    """
    for c in x:
        is_cjk = u'\u4e00' <= c <= u'\u9fff'
        is_ascii_alnum = ('A' <= c <= 'Z') or ('a' <= c <= 'z') or ('0' <= c <= '9')
        if not (is_cjk or is_ascii_alnum):
            return False
    return True

def cut_title(title):
    """Segment a title with jieba, keeping only CJK / ASCII-alphanumeric terms."""
    import jieba
    return [term for term in jieba.cut(title) if title_checkword(term)]

測試斷詞函式


In [181]:
# Quick check of the title tokenizer on the first article's title.
# NOTE(review): `article_info` is only defined in a later cell (In [151]);
# this cell depends on that one having been executed first.
cut_title(article_info.first()[0])


Out[181]:
[u'\u559c\u4f86',
 u'\u767b',
 u'\u4e4b',
 u'\u5341\u4e8c',
 u'\u5eda',
 u'All',
 u'u',
 u'can',
 u'eat']

取得標題和URL


In [151]:
# (title, url) pairs for every article (fields [11] and [12] of the raw rows).
article_info = df.map(lambda x : (x[11],x[12]))

print type(article_info)
article_info.first()


<class 'pyspark.rdd.PipelinedRDD'>
(u'\u559c\u4f86\u767b\u4e4b\u5341\u4e8c\u5eda   All u can eat ', u'http://louis740321.pixnet.net/blog/post/373737533')

將斷過的詞兩兩相接


In [182]:
def bio_wordset(words):
    """Concatenate each adjacent token pair: [a, b, c] -> [ab, bc].

    Lists with fewer than two tokens give [].  Inputs whose elements cannot
    be sliced/concatenated also give [] — preserving the original
    best-effort contract, but catching only TypeError instead of a bare
    `except:` (which also swallowed KeyboardInterrupt/SystemExit).

    Also replaces the two-argument `map(lambda x, y: x+y, ...)` — a
    Python-2-only idiom that returns a lazy iterator on Python 3 — with a
    zip-based list comprehension that returns a list on both.
    """
    try:
        return [first + second for first, second in zip(words[:-1], words[1:])]
    except TypeError:
        return []

In [183]:
# Sanity-check the bigram builder on the first title's tokens.
bio_test_1 = cut_title(article_info.first()[0])
print type(bio_test_1)
bio_test_2 = bio_wordset(bio_test_1)
print bio_test_2


<type 'list'>
[u'\u559c\u4f86\u767b', u'\u767b\u4e4b', u'\u4e4b\u5341\u4e8c', u'\u5341\u4e8c\u5eda', u'\u5edaAll', u'Allu', u'ucan', u'caneat']

取得餐廳名稱資料 數字替換資料 同義詞替換資料


In [184]:
import pandas as pd

res = sc.textFile('./restaurant.csv').map(lambda line: line.split(',')).map(
    lambda line: line[2]).collect()[1:]

ex_digit = sc.textFile('./exchange_word.csv').map(lambda line: line.split(',')).map(
    lambda line: (line[0],line[1])).filter(lambda x: x[0].isdigit()).collect()

ex_word = sc.textFile('./exchange_word.csv').map(lambda line: line.split(',')).map(
    lambda line: (line[0],line[1])).filter(lambda x: not (x[0].isdigit())).collect()

res_data = list(set(res))

In [185]:
# Inspect the substitution tables loaded above.
print ex_word
print ex_digit


[(u'\u71d2\u8089', u'\u71d2\u70e4'), (u'\u98ef\u5e97', u'\u9152\u5e97'), (u'\u5403\u5230\u98fd', u'\u81ea\u52a9\u9910\u5ef3'), (u'\u71d2\u70e4', u'\u71d2\u8089'), (u'buffet', u'\u81ea\u52a9\u9910\u5ef3')]
[(u'2', u'\u4e8c'), (u'3', u'\u4e09'), (u'4', u'\u56db'), (u'5', u'\u4e94'), (u'6', u'\u516d'), (u'7', u'\u4e03'), (u'8', u'\u516b'), (u'9', u'\u4e5d'), (u'10', u'\u5341'), (u'1', u'\u4e00')]

檢查是否有同義詞出現


In [157]:
def check_word(title_str):
    """True if any synonym source from the global ex_word table occurs in the string."""
    return any(source in title_str for source, target in ex_word)

替換同義詞


In [96]:
def change_word(title_str):
    """Replace the FIRST matching synonym (from global ex_word) and return.

    Only one substitution is applied; a string matching no entry is
    returned unchanged.
    """
    for source, target in ex_word:
        if source in title_str:
            return title_str.replace(source, target)
    return title_str

In [170]:
#需要丟入斷過詞的title list
def change_list(input_list):
    result = ''.join([change_word(i) for i in input_list])
    return result

In [188]:
# Synonym replacement demo (expects 燒肉 -> 燒烤 per the ex_word table).
change_list_test = u'32比較公雞燒肉'
change_list_test = change_list(cut_title(change_list_test))
print change_list_test


32比較公雞燒烤

替換數字


In [186]:
def change_num(input_str):
    # Convert ASCII digits in a string to Chinese numerals using the global
    # ex_digit table, inserting u'十' (ten) between two consecutive digits
    # so that e.g. u'32' becomes u'三十二'.
    output_str = ''
    for index, i in enumerate(input_str):
        count = 0
        for j in ex_digit:
            # j is a (digit-string, numeral) pair; only single-character
            # keys can ever match a single character i here.
            if i == j[0]:
                count += 1
                output_str += j[1]
                try:
                    # If the NEXT character is also a digit, the current one
                    # is a tens digit: append u'十'.
                    # NOTE(review): this only handles two-digit numbers; u'0'
                    # has no table entry, so e.g. u'10' becomes u'一十0'.
                    if input_str[index+1].isdigit():
                        output_str += u'十'
                except:
                    # index+1 past the end of the string: last character,
                    # nothing more to do for this digit.
                    continue
        if count == 0:
            # Non-digit characters pass through unchanged.
            output_str += i
    return output_str

In [187]:
# Digit conversion demo: u'32...' -> u'三十二...'.
change_num_test = u'32比較公雞迴轉'
change_num_test = change_num(change_num_test)
print change_num_test


三十二比較公雞迴轉

檢查餐廳種類資料


In [189]:
#for i in res_data:
#    print i

整理出長度大於 1(即至少兩個字)的詞


In [190]:
def creat_long(short_str):
    """Keep only tokens that are at least two characters long."""
    kept = []
    for token in short_str:
        if len(token) > 1:
            kept.append(token)
    return kept

In [191]:
# Demo: filter the first title's tokens down to multi-character terms.
creat_long(cut_title(article_info.first()[0]))


Out[191]:
[u'\u559c\u4f86', u'\u5341\u4e8c', u'All', u'can', u'eat']

將英文詞分割出來


In [192]:
def separate_eng(input_str):
    """Pick out tokens that are alphabetic AND start with an ASCII letter.

    The isalpha() check alone would also pass CJK tokens, so the first
    character is additionally required to be in A-Z or a-z.
    """
    return [tok for tok in input_str
            if tok.isalpha() and (('A' <= tok[0] <= 'Z') or ('a' <= tok[0] <= 'z'))]

In [193]:
# Demo: English tokens of the first article's title.
aa = separate_eng(cut_title(article_info.first()[0]))
#print aa
#print article_info.collect()

長詞的比較


In [194]:
def long_term_compare(title, name):
    """Count long title tokens that also occur among the name's long tokens."""
    return sum(1 for token in title if token in name)

In [195]:
def bio_long_term_compare(title, name):
    """Count long title bigrams that also occur among the name's long bigrams."""
    matches = [token for token in title if token in name]
    return len(matches)

包含全部詞的比較


In [196]:
def term_compare(title, name):
    """Count title tokens that also occur in the name's token list."""
    hits = 0
    for token in title:
        if token in name:
            hits += 1
    return hits

In [197]:
def bio_term_compare(title, name):
    """Count title bigrams that also occur in the name's bigram list."""
    return len([token for token in title if token in name])

In [198]:
# Quick check: three of q1's five terms also appear in q2.
q1 = ['豆腐','好臭','誰的','好香','滴油']
q2 = ['豆腐','好臭','誰的']
term_compare(q1, q2)


Out[198]:
3

導入計分function


In [200]:
#將六個參數帶入計分 並且儲存計分分佈 分別為: 短詞比對,短詞相接比對,長詞比對,長詞相接比對,英文比對,完全比對

def calculate_pts(short_uni, short_bio, long_uni, long_bio, eng_name, exact, pts_record):
    """Weighted match score for one (title, restaurant) pair.

    Side effect: appends the raw component vector
    [short_uni, short_bio, long_uni, long_bio, eng_name, exact]
    to `pts_record` so the breakdown can be inspected later.
    An exact match dominates every other signal via a huge weight.
    """
    components = [short_uni, short_bio, long_uni, long_bio, eng_name, exact]
    pts_record.append(components)
    weights = [1, 2, 1, 4, 4, 100000]
    return sum(w * c for w, c in zip(weights, components))

實作預測函式


In [135]:
def predict_restaurant(title):
    # Predict which restaurant a blog-post title refers to: score every
    # known restaurant name (global res_data) against the title and return
    # the top three as (score, restaurant_name, score_breakdown) tuples.

    # Cut the pieces we need out of the title
    title_token = cut_title(change_num(title))
    biogram_title_token = bio_wordset(title_token)
    long_title_token = creat_long(title_token)
    long_biogram_title_token = bio_wordset(long_title_token)
    Eng_title_token = separate_eng(title_token)

    pts_list = list()
    pts_record = list()
    exact_pts = 0

    for i in res_data:

        # Apply the same preprocessing to each restaurant name
        exact_pts = exact_match(i, title)
        name_token = cut_title(change_num(i))
        biogram_name_token = bio_wordset(name_token)
        long_name_token = creat_long(name_token)
        long_biogram_name_token = bio_wordset(long_name_token)
        Eng_name_token = separate_eng(name_token)

        # Compute the matching features
        short_uni = term_compare(title_token, name_token)
        short_bio = bio_term_compare(biogram_title_token,biogram_name_token)
        long_uni = long_term_compare(long_title_token, long_name_token)
        long_bio = bio_long_term_compare(long_biogram_title_token, long_biogram_name_token)
        eng_name = term_compare(Eng_title_token, Eng_name_token)

        # Feed the features into the scoring function
        pts_list.append(calculate_pts(
            short_uni, short_bio, long_uni, long_bio, eng_name, exact_pts, pts_record))#

    # Sort and keep the top three
    rank_list = zip(pts_list, res_data, pts_record)
    rank_list_sorted = sorted(rank_list, reverse = True)
    return rank_list_sorted[:3]

對所有文章的標題進行預測


In [201]:
# Predict the top-3 restaurants for every article title (lazy RDD).
last_test = article_info.map(lambda x: x[0]).map(lambda x : predict_restaurant(x))

快速檢查斷詞比對


In [59]:
# Hand-picked titles for eyeballing the tokenization pipeline via test_2.
rr = [u'【小宅食記】喜來登kitchen 12早餐吃到飽|美好一天的開始:台北市中正區',u'雲軒西餐廳 La Rotisserie - 君品酒店',
u'非凡大探索-吃到飽-喜來登十二廚下午茶',u'十二廚自助餐廳 - 台北喜來登大飯店',u'槿韓食堂 -韓式料理吃到飽(1F)']

In [117]:
def test_2(inin):
    # Debug helper: for each input title, print every intermediate token set
    # used by predict_restaurant (unigrams, bigrams, long terms, long-term
    # bigrams, English terms), one section per rule.
    for i in inin:
        print '======================'
        title_token = cut_title(i)
        for j in title_token:
            print j
        print('---------------')
        biogram_title_token = bio_wordset(title_token)
        for j in biogram_title_token:
            print j
        print('---------------')
        long_title_token = creat_long(title_token)
        for j in long_title_token:
            print j
        print('---------------')
        long_biogram_title_token = bio_wordset(long_title_token)
        for j in long_biogram_title_token:
            print j
        print('---------------')
        Eng_title_token = separate_eng(title_token)
        for j in Eng_title_token:
            print j
        print('---------------')

In [202]:
#test_2(rr)

將預測結果導成list


In [203]:
# Materialize the predictions on the driver.
predict_result = last_test.collect()

In [205]:
#predict_result

檢查每筆前三名餐廳


In [3]:
#for index,i in enumerate(qqq):
#    print index
#    for j in i:
#        print j[1]

列出每筆標題 配對餐廳 分數分佈


In [209]:
def evaluation(title, prediction):
    # Print, for each (title, prediction) pair: a running index, the title,
    # the best-match restaurant name (b[0][1]) and its score breakdown
    # (b[0][2]) as produced by predict_restaurant.
    count = 0
    for a, b in zip(title, prediction):
        print count
        print ('===========================')
        print a
        print b[0][1]
        print b[0][2]
        print ('===========================')
        count += 1

In [210]:
# NOTE(review): `last_test_1` and `qqq` are not defined anywhere in this
# notebook, so this cell raises NameError as written.  Presumably the
# intended arguments are the collected titles and `predict_result` —
# verify before running.
evaluation(last_test_1, qqq)

In [ ]: